#include "opencl_source_map.hpp" 
namespace MNN { 
#ifndef MNN_OPENCL_BUFFER_CLOSED
const char* matmul_local_buf = 
"#ifdef MNN_SUPPORT_FP16\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
"/*\n"
" "" #define OPWM 64 // The outputsize-per-workgroup in dimension M\n"
" #define OPWN 128 // The outputsize-per-workgroup in dimension N\n"
" #define CPWK 8 // The cachesize-per-workgroup in dimension K\n"
" #define OPTM 4 // The outputsize-per-thread in dimension M\n"
" #define OPTN 8 // The outputsize-per-thread in dimension N\n"
" */\n"
"#define TPWM (OPWM/OPTM) // The threadsize-per-workgroup in dimension M\n"
"#define TPWN (OPWN/OPTN) // The threadsize-per-workgroup in dimension N\n"
"#define LPTA ((CPWK*OPWM)/(TPWM*TPWN)) // Loads-num-per-thread for A\n"
"#define LPTB ((CPWK*OPWN)/(TPWM*TPWN)) // Loads-num-per-thread for B\n"
"// vetorize+pragma unroll\n"
"__kernel void matmul_local_buf(const int M,const int N,const int K,\n"
" __global const FLOAT* A,\n"
"#if (defined USE_LOW_BIT_WEIGHT_INT8)\n"
" __global const char* B,\n"
" __global const float* dequantScale,\n"
" __global const float* dequantOffset,\n"
"#elif (defined USE_LOW_BIT_WEIGHT_INT4)\n"
" __global const uchar* B,\n"
" __global const float* dequantScale,\n"
" __global const float* dequantOffset,\n"
"#else\n"
" __global const FLOAT* B,\n"
"#endif\n"
"#ifdef BIAS\n"
" __global const FLOAT* bias,\n"
"#endif\n"
" __global FLOAT* C) {\n"
" // Local thread id\n"
" const int lidm=get_local_id(0); // Local row ID\n"
" const int lidn=get_local_id(1); // Local col ID\n"
" // group id\n"
" const int offsetM=get_group_id(0)*OPWM; // Work-group offset M\n"
" const int offsetN=get_group_id(1)*OPWN; // Work-group offset N\n"
" // Local memory for work-group cache of A and B\n"
" __local FLOAT Alocal[CPWK][OPWM];\n"
" __local FLOAT Blocal[OPWN][CPWK+2];\n"
" // Allocate register space\n"
" COMPUTE_FLOAT sum[OPTM][OPTN];\n"
" // Initialise the accumulation registers\n"
" for (int wm=0; wm<OPTM; wm++) {\n"
" for (int wn=0; wn<OPTN; wn++) {\n"
" sum[wm][wn]=0.0f;\n"
" }\n"
" }\n"
" \n"
" // Loop over all tiles\n"
" const int numLoops=K/CPWK;\n"
" int lid=lidn*TPWM+lidm;\n"
" for (int t=0; t<numLoops; t++) {\n"
" // Load one work-group of A and B into local memory\n"
" for (int la=0; la<LPTA; la++) {\n"
" int id=la*TPWN*TPWM+lid;\n"
" int row=id % OPWM;\n"
" int col=id/OPWM;\n"
" int tiledIndex=CPWK*t+col;\n"
" #ifdef TRANSPOSE_A\n"
" // [K,M]\n"
" Alocal[col][row]=A[tiledIndex*M+(offsetM+row)];\n"
" #else\n"
" // [M,K]\n"
" Alocal[col][row]=A[(offsetM+row)*K+tiledIndex];\n"
" #endif\n"
" }\n"
" for (int la=0; la<LPTB; la++) {\n"
" int id=la*TPWN*TPWM+lid;\n"
" int row=id % OPWN;\n"
" int col=id/OPWN;\n"
" int tiledIndex=CPWK*t+col;\n"
" #ifdef TRANSPOSE_B\n"
" // [N,K]\n"
" Blocal[row][col]=B[(offsetN+row)*K+tiledIndex];\n"
" #else\n"
" // [K,N]\n"
" Blocal[row][col]=B[tiledIndex*N+offsetN+row];\n"
" #endif\n"
" }\n"
" \n"
" // Synchronise to make sure the tile is loaded\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" // Loop over the values of a single tile\n"
" \n"
" // Perform the computation\n"
" FLOAT4 A_k0,B_k0[OPTN];\n"
" {\n"
" int row=lidm;\n"
" int col=lidn;\n"
" \n"
" A_k0.s0=Alocal[0][row];\n"
" A_k0.s1=Alocal[1][row];\n"
" A_k0.s2=Alocal[2][row];\n"
" A_k0.s3=Alocal[3][row];\n"
" \n"
" #pragma unroll\n"
" for (int wn=0; wn<OPTN; wn++) {\n"
" B_k0[wn].s0=Blocal[col][0];\n"
" B_k0[wn].s1=Blocal[col][1];\n"
" B_k0[wn].s2=Blocal[col][2];\n"
" B_k0[wn].s3=Blocal[col][3];\n"
" sum[0][wn] += dot(A_k0,B_k0[wn]);\n"
" col += TPWN;\n"
" }\n"
" \n"
" #pragma unroll\n"
" for(int wm=1; wm<OPTM; wm++) {\n"
" row += TPWM;\n"
" A_k0.s0=Alocal[0][row];\n"
" A_k0.s1=Alocal[1][row];\n"
" A_k0.s2=Alocal[2][row];\n"
" A_k0.s3=Alocal[3][row];\n"
" for (int wn=0; wn<OPTN; wn++) {\n"
" sum[wm][wn] += dot(A_k0,B_k0[wn]);\n"
" }\n"
" }\n"
" }\n"
" {\n"
" int col=lidn;\n"
" for (int wn=0; wn<OPTN; wn++) {\n"
" B_k0[wn].s0=Blocal[col][4];\n"
" B_k0[wn].s1=Blocal[col][5];\n"
" B_k0[wn].s2=Blocal[col][6];\n"
" B_k0[wn].s3=Blocal[col][7];\n"
" col += TPWN;\n"
" }\n"
" int row=lidm;\n"
" for (int wm=0; wm<OPTM; wm++) {\n"
" A_k0.s0=Alocal[4][row];\n"
" A_k0.s1=Alocal[5][row];\n"
" A_k0.s2=Alocal[6][row];\n"
" A_k0.s3=Alocal[7][row];\n"
" for (int wn=0; wn<OPTN; wn++) {\n"
" sum[wm][wn] += dot(A_k0,B_k0[wn]);\n"
" }\n"
" row += TPWM;\n"
" }\n"
" }\n"
" // Synchronise before loading the next tile\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" }\n"
" // Store the final results in C\n"
" for (int wm=0; wm<OPTM; wm++) {\n"
" int globalRow=offsetM+lidm+wm*TPWM;\n"
" for (int wn=0; wn<OPTN; wn++) {\n"
" int globalCol=offsetN+lidn+wn*TPWN;\n"
" #ifdef BIAS\n"
" sum[wm][wn] += bias[globalCol];\n"
" #endif\n"
" C[globalRow*N+globalCol]=sum[wm][wn];\n"
" }\n"
" }\n"
"}\n"
"// double buffer\n"
"__kernel void matmul_local_double_buf(const int M,const int N,const int K,\n"
" __global const FLOAT* A,\n"
"#if (defined USE_LOW_BIT_WEIGHT_INT8)\n"
" __global const char* B,\n"
" __global const float* dequantScale,\n"
" __global const float* dequantOffset,\n"
"#elif (defined USE_LOW_BIT_WEIGHT_INT4)\n"
" __global const uchar* B,\n"
" __global const float* dequantScale,\n"
" __global const float* dequantOffset,\n"
"#else\n"
" __global const FLOAT* B,\n"
"#endif\n"
"#ifdef BIAS\n"
" __global const FLOAT* bias,\n"
"#endif\n"
" __global FLOAT* C) {\n"
" // Local thread id\n"
" const ushort lidm=get_local_id(0); // Local row ID\n"
" const ushort lidn=get_local_id(1); // Local col ID\n"
" // group id\n"
" const ushort offsetM=get_group_id(0)*OPWM; // Work-group offset M\n"
" const ushort offsetN=get_group_id(1)*OPWN; // Work-group offset N\n"
" // Local memory for work-group cache of A and B\n"
" __local FLOAT AlocalR[CPWK][OPWM];\n"
" __local FLOAT BlocalR[OPWN][CPWK+2];\n"
" __local FLOAT AlocalC[CPWK][OPWM];\n"
" __local FLOAT BlocalC[OPWN][CPWK+2];\n"
" \n"
" // Allocate register space\n"
" COMPUTE_FLOAT sum[OPTM][OPTN];\n"
" // Initialise the accumulation registers\n"
" for (ushort wm=0; wm<OPTM; wm++) {\n"
" for (ushort wn=0; wn<OPTN; wn++) {\n"
" sum[wm][wn]=0.0f;\n"
" }\n"
" }\n"
" \n"
" // Loop over all tiles\n"
" const ushort numLoops=K/CPWK;\n"
" ushort lid=lidn*TPWM+lidm;\n"
" for (ushort t=0; t<numLoops; t++) {\n"
" // Load one work-group of A and B into local memory\n"
" for (ushort la=0; la<LPTA; la++) {\n"
" ushort id=la*TPWN*TPWM+lid;\n"
" ushort row=id % OPWM;\n"
" ushort col=id/OPWM;\n"
" ushort tiledIndex=CPWK*t+col;\n"
" #ifdef TRANSPOSE_A\n"
" // [K,M]\n"
" AlocalR[col][row]=A[tiledIndex*M+(offsetM+row)];\n"
" #else\n"
" // [M,K]\n"
" AlocalR[col][row]=A[(offsetM+row)*K+tiledIndex];\n"
" #endif\n"
" }\n"
" for (ushort la=0; la<LPTB; la++) {\n"
" ushort id=la*TPWN*TPWM+lid;\n"
" ushort row=id % OPWN;\n"
" ushort col=id/OPWN;\n"
" ushort tiledIndex=CPWK*t+col;\n"
" #ifdef TRANSPOSE_B\n"
" // [N,K]\n"
" BlocalR[row][col]=B[(offsetN+row)*K+tiledIndex];\n"
" #else\n"
" // [K,N]\n"
" BlocalR[row][col]=B[tiledIndex*N+offsetN+row];\n"
" #endif\n"
" }\n"
" \n"
" // Synchronise to make sure the tile is loaded\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" // Loop over the values of a single tile\n"
" \n"
" // Perform the computation\n"
" FLOAT4 A_k0,B_k0[OPTN];\n"
" {\n"
" ushort row=lidm;\n"
" ushort col=lidn;\n"
" \n"
" A_k0.s0=AlocalR[0][row];\n"
" A_k0.s1=AlocalR[1][row];\n"
" A_k0.s2=AlocalR[2][row];\n"
" A_k0.s3=AlocalR[3][row];\n"
" \n"
" #pragma unroll\n"
" for (ushort wn=0; wn<OPTN; wn++) {\n"
" B_k0[wn].s0=BlocalR[col][0];\n"
" B_k0[wn].s1=BlocalR[col][1];\n"
" B_k0[wn].s2=BlocalR[col][2];\n"
" B_k0[wn].s3=BlocalR[col][3];\n"
" sum[0][wn] += dot(A_k0,B_k0[wn]);\n"
" col += TPWN;\n"
" }\n"
" \n"
" #pragma unroll\n"
" for(ushort wm=1; wm<OPTM; wm++) {\n"
" row += TPWM;\n"
" A_k0.s0=AlocalR[0][row];\n"
" A_k0.s1=AlocalR[1][row];\n"
" A_k0.s2=AlocalR[2][row];\n"
" A_k0.s3=AlocalR[3][row];\n"
" for (ushort wn=0; wn<OPTN; wn++) {\n"
" sum[wm][wn] += dot(A_k0,B_k0[wn]);\n"
" }\n"
" }\n"
" }\n"
" {\n"
" int col=lidn;\n"
" for (ushort wn=0; wn<OPTN; wn++) {\n"
" B_k0[wn].s0=BlocalR[col][4];\n"
" B_k0[wn].s1=BlocalR[col][5];\n"
" B_k0[wn].s2=BlocalR[col][6];\n"
" B_k0[wn].s3=BlocalR[col][7];\n"
" col += TPWN;\n"
" }\n"
" ushort row=lidm;\n"
" for (ushort wm=0; wm<OPTM; wm++) {\n"
" A_k0.s0=AlocalR[4][row];\n"
" A_k0.s1=AlocalR[5][row];\n"
" A_k0.s2=AlocalR[6][row];\n"
" A_k0.s3=AlocalR[7][row];\n"
" for (ushort wn=0; wn<OPTN; wn++) {\n"
" sum[wm][wn] += dot(A_k0,B_k0[wn]);\n"
" }\n"
" row += TPWM;\n"
" }\n"
" }\n"
" \n"
" t++;\n"
" // Loop over the values of a single tile\n"
" // Load one work-group of A and B into local memory\n"
" for (ushort la=0; la<LPTA; la++) {\n"
" ushort id=la*TPWN*TPWM+lid;\n"
" ushort row=id % OPWM;\n"
" ushort col=id/OPWM;\n"
" ushort tiledIndex=CPWK*t+col;\n"
" #ifdef TRANSPOSE_A\n"
" // [K,M]\n"
" AlocalC[col][row]=A[tiledIndex*M+(offsetM+row)];\n"
" #else\n"
" // [M,K]\n"
" AlocalC[col][row]=A[(offsetM+row)*K+tiledIndex];\n"
" #endif\n"
" }\n"
" for (ushort la=0; la<LPTB; la++) {\n"
" ushort id=la*TPWN*TPWM+lid;\n"
" ushort row=id % OPWN;\n"
" ushort col=id/OPWN;\n"
" ushort tiledIndex=CPWK*t+col;\n"
" #ifdef TRANSPOSE_B\n"
" // [N,K]\n"
" BlocalC[row][col]=B[(offsetN+row)*K+tiledIndex];\n"
" #else\n"
" // [K,N]\n"
" BlocalC[row][col]=B[tiledIndex*N+offsetN+row];\n"
" #endif\n"
" }\n"
" // Synchronise to make sure the tile is loaded\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" // Perform the computation\n"
" {\n"
" ushort row=lidm;\n"
" ushort col=lidn;\n"
" \n"
" A_k0.s0=AlocalC[0][row];\n"
" A_k0.s1=AlocalC[1][row];\n"
" A_k0.s2=AlocalC[2][row];\n"
" A_k0.s3=AlocalC[3][row];\n"
" \n"
" #pragma unroll\n"
" for (ushort wn=0; wn<OPTN; wn++) {\n"
" B_k0[wn].s0=BlocalC[col][0];\n"
" B_k0[wn].s1=BlocalC[col][1];\n"
" B_k0[wn].s2=BlocalC[col][2];\n"
" B_k0[wn].s3=BlocalC[col][3];\n"
" sum[0][wn] += dot(A_k0,B_k0[wn]);\n"
" col += TPWN;\n"
" }\n"
" \n"
" #pragma unroll\n"
" for(ushort wm=1; wm<OPTM; wm++) {\n"
" row += TPWM;\n"
" A_k0.s0=AlocalC[0][row];\n"
" A_k0.s1=AlocalC[1][row];\n"
" A_k0.s2=AlocalC[2][row];\n"
" A_k0.s3=AlocalC[3][row];\n"
" for (ushort wn=0; wn<OPTN; wn++) {\n"
" sum[wm][wn] += dot(A_k0,B_k0[wn]);\n"
" }\n"
" }\n"
" }\n"
" {\n"
" ushort col=lidn;\n"
" for (ushort wn=0; wn<OPTN; wn++) {\n"
" B_k0[wn].s0=BlocalC[col][4];\n"
" B_k0[wn].s1=BlocalC[col][5];\n"
" B_k0[wn].s2=BlocalC[col][6];\n"
" B_k0[wn].s3=BlocalC[col][7];\n"
" col += TPWN;\n"
" }\n"
" ushort row=lidm;\n"
" for (ushort wm=0; wm<OPTM; wm++) {\n"
" A_k0.s0=AlocalC[4][row];\n"
" A_k0.s1=AlocalC[5][row];\n"
" A_k0.s2=AlocalC[6][row];\n"
" A_k0.s3=AlocalC[7][row];\n"
" for (ushort wn=0; wn<OPTN; wn++) {\n"
" sum[wm][wn] += dot(A_k0,B_k0[wn]);\n"
" }\n"
" row += TPWM;\n"
" }\n"
" }\n"
" }\n"
" // Store the final results in C\n"
" for (ushort wm=0; wm<OPTM; wm++) {\n"
" ushort globalRow=offsetM+lidm+wm*TPWM;\n"
" for (ushort wn=0; wn<OPTN; wn++) {\n"
" ushort globalCol=offsetN+lidn+wn*TPWN;\n"
" #ifdef BIAS\n"
" sum[wm][wn] += bias[globalCol];\n"
" #endif\n"
" C[globalRow*N+globalCol]=sum[wm][wn];\n"
" }\n"
" }\n"
"}\n"
;
#endif
}
